# The purpose of this script is to analyze users who tweet photos and compare them to those who tweet protest photos.
# NB: The script uses raw data.

###########
#
# GLOBALS
#
###########
#setwd('/path/to/replication/')

library(jsonlite)
library(dplyr)
library(stargazer)
library(ggplot2)
library(reshape)
library(ggstance)  # For better dodging
library(plotrix)  # For std.error calculation to dplyr

###########
#
# TWEETS WITH NON-PROTEST IMAGES
#
###########
# Read the tweets that carry images; stream_in() parses one JSON record per
# line and returns a data frame in which `place` and `user` are nested data
# frames.
data <- stream_in(file('./Data/comparingUsers/tweets_with_images.txt'))

# Filter for countries in study.  Each spelling that appears in
# place$country is listed exactly once.  The commented-out list is the
# earlier version that also kept the Russian spellings.
#countries <- c('Venezuela', 'Venezuela, Bolivarian Republic of', 'Rusia', 'Rusland', 'Russia', 'Russie', 'Russland', 'Rússia', 'Ukraine', 'Hong Kong', 'Republic of Korea', 'Spagna', 'Spain', 'Spanien', 'Spanje', '대한민국', '香港')
countries <- c('Venezuela', 'Venezuela, Bolivarian Republic of',
               'Ukraine',
               'Hong Kong', '香港',
               'Republic of Korea', '대한민국',
               'Spagna', 'Spain', 'Spanien', 'Spanje',
               'Pakistan', 'Pakistán', 'پاکستان')
data <- data[data$place$country %in% countries,]

# Standardize country names.  %in% is used throughout so that a missing
# country can never end up inside a subscripted assignment.
data$place$country[data$place$country %in% c('Venezuela, Bolivarian Republic of')] <- 'Venezuela'
# No-op with the current `countries` list (no Russian spelling survives the
# filter); only matters if the commented-out list above is restored.
data$place$country[data$place$country %in% c('Rusia', 'Rusland', 'Russie', 'Russland', 'Rússia')] <- 'Russia'
data$place$country[data$place$country %in% c('Republic of Korea', '대한민국')] <- 'SK'
# Hong Kong is abbreviated to 'HK' here; the plotting section expands it
# again.  (A later recode of '香港' to 'Hong Kong' was removed: it could never
# match because '香港' has already been turned into 'HK' at this point.)
data$place$country[data$place$country %in% c('Hong Kong', '香港')] <- 'HK'
data$place$country[data$place$country %in% c('Spagna', 'Spain', 'Spanien', 'Spanje')] <- 'Spain'
data$place$country[data$place$country %in% c('Pakistán', 'پاکستان')] <- 'Pakistan'



# Convert Twitter's timestamp strings to POSIXct (GMT).
# NOTE(review): %a and %b are matched against the current LC_TIME locale, so
# this presumably needs an English locale to parse -- confirm on the machine
# used for replication.
data$user.created_at2 <- as.POSIXct(data$user$created_at, format="%a %b %d %H:%M:%S +0000 %Y", tz="GMT")
data$created_at2 <- as.POSIXct(data$created_at, format="%a %b %d %H:%M:%S +0000 %Y", tz="GMT")

# Get days on Twitter (account age when the tweet was sent).  Round since do
# not care about hours.
# Subtracting two POSIXct vectors lets R choose the unit (secs, mins, hours or
# days) from the smallest difference in the data, so dividing the result by
# 60*24 is only right when that unit happens to be minutes.  Asking difftime()
# for days explicitly gives the same answer regardless of the data.
data$daysOnTwitter <- difftime(data$created_at2, data$user.created_at2, units="days")
data$daysOnTwitter <- abs(round(as.numeric(data$daysOnTwitter)))

# Copy the nested fields that get aggregated into plain top-level columns so
# that dplyr can refer to them by a simple name.
data$placeCountry <- data$place$country
data$followers <- data$user$followers_count
data$friends <- data$user$friends_count
data$statuses <- data$user$statuses_count

# Per-country summary statistics over every tweet, i.e. a user who appears in
# several tweets is counted once per tweet.  All statistics drop NAs (the
# means of statuses and account age previously did not, unlike the rest).
data_short <- data %>%
  group_by(placeCountry) %>%
  summarize(
    meanFollowers=mean(followers, na.rm=TRUE),
    medianFollowers=median(followers, na.rm=TRUE),
    meanFollowing=mean(friends, na.rm=TRUE),
    medianFollowing=median(friends, na.rm=TRUE),
    meanStatuses=mean(statuses, na.rm=TRUE),
    medianStatuses=median(statuses, na.rm=TRUE),
    meanAge=mean(daysOnTwitter, na.rm=TRUE),
    medianAge=median(daysOnTwitter, na.rm=TRUE),
    count=n()
  )
data_short$describe <- 'noprotest_allUsers'

# Matching table of standard errors (not SDs) for all users.  The columns
# mirror data_short one-for-one because the plotting section pairs the two
# tables by position.  The median*_se columns hold the standard error of the
# mean of the same variable, and `count` is simply the group size again.
data_short_se <- data %>%
  group_by(placeCountry) %>%
  summarize(
    meanFollowers_se=std.error(followers),
    medianFollowers_se=std.error(followers),
    meanFollowing_se=std.error(friends),
    medianFollowing_se=std.error(friends),
    meanStatuses_se=std.error(statuses),
    medianStatuses_se=std.error(statuses),
    meanAge_se=std.error(daysOnTwitter),
    medianAge_se=std.error(daysOnTwitter),
    count=n()
  )
data_short_se$describe <- 'noprotest_allUsers'

# But some users are in there multiple times.  To deal with that, keep one
# randomly chosen tweet per user.  Only the plain columns are carried over
# because the sampling step failed on the full data frame.
data$userID <- data$user$id
data_narrow <- data[names(data) %in% c('placeCountry', 'followers', 'friends', 'statuses', 'daysOnTwitter', 'userID')]
# Fix the seed so the randomly kept tweet -- and therefore every unique-user
# statistic written below -- is the same on every run.
set.seed(2019)
data_unique <- data_narrow %>%
  group_by(userID) %>%
  sample_n(size=1) %>%
  ungroup()

# Same summary statistics, one row per user.
data_unique_short <- data_unique %>%
  group_by(placeCountry) %>%
  summarize(
    meanFollowers=mean(followers, na.rm=TRUE),
    medianFollowers=median(followers, na.rm=TRUE),
    meanFollowing=mean(friends, na.rm=TRUE),
    medianFollowing=median(friends, na.rm=TRUE),
    meanStatuses=mean(statuses, na.rm=TRUE),
    medianStatuses=median(statuses, na.rm=TRUE),
    meanAge=mean(daysOnTwitter, na.rm=TRUE),
    medianAge=median(daysOnTwitter, na.rm=TRUE),
    count=n()
  )
data_unique_short$describe <- 'no_protest_uniqueUsers'

# Standard errors for unique users (same layout as data_short_se).
data_unique_short_se <- data_unique %>%
  group_by(placeCountry) %>%
  summarize(
    meanFollowers_se=std.error(followers),
    medianFollowers_se=std.error(followers),
    meanFollowing_se=std.error(friends),
    medianFollowing_se=std.error(friends),
    meanStatuses_se=std.error(statuses),
    medianStatuses_se=std.error(statuses),
    meanAge_se=std.error(daysOnTwitter),
    medianAge_se=std.error(daysOnTwitter),
    count=n()
  )
data_unique_short_se$describe <- 'no_protest_uniqueUsers'

# Stack the all-tweets and unique-user tables; `describe` tells them apart.
noprotest <- data.frame(rbind(data_short, data_unique_short))
noprotest_se <- data.frame(rbind(data_short_se, data_unique_short_se))

# Save.  The two CSVs are what the plotting section reads back in.
write.csv(noprotest, './Data/comparingUsers/summaryStats_images_noprotest.csv', row.names=FALSE)
write.csv(noprotest_se, './Data/comparingUsers/summaryStats_images_noprotest_se.csv', row.names=FALSE)
# NOTE(review): no `type` is given, so this file presumably contains
# stargazer's default LaTeX output despite the .csv extension -- confirm
# before renaming it.
stargazer(noprotest, summary=FALSE, out='./Tables/summaryStats_images_noprotest.csv')



###########
#
# TWEETS WITH PROTEST IMAGES
#   NB: I could have made a function, but for only two files it did not seem worth it.
#   NB: Copied and pasted the above code chunk, replaced data -> data2 (ctrl+f), then manually updated the file name and some strings.
###########
# Read the tweets that carry protest images; stream_in() parses one JSON
# record per line and returns a data frame in which `place` and `user` are
# nested data frames.
data2 <- stream_in(file('./Data/comparingUsers/tweets_with_protest_images.txt'))

# Filter for countries in study.  Each spelling that appears in
# place$country is listed exactly once.  The commented-out list is the
# earlier version that also kept the Russian spellings.
#countries <- c('Venezuela', 'Venezuela, Bolivarian Republic of', 'Rusia', 'Rusland', 'Russia', 'Russie', 'Russland', 'Rússia', 'Ukraine', 'Hong Kong', 'Republic of Korea', 'Spagna', 'Spain', 'Spanien', 'Spanje', '대한민국', '香港')
countries <- c('Venezuela', 'Venezuela, Bolivarian Republic of',
               'Ukraine',
               'Hong Kong', '香港',
               'Republic of Korea', '대한민국',
               'Spagna', 'Spain', 'Spanien', 'Spanje',
               'Pakistan', 'Pakistán', 'پاکستان')
data2 <- data2[data2$place$country %in% countries,]

# Standardize country names.  %in% is used throughout so that a missing
# country can never end up inside a subscripted assignment.
data2$place$country[data2$place$country %in% c('Venezuela, Bolivarian Republic of')] <- 'Venezuela'
# No-op with the current `countries` list (no Russian spelling survives the
# filter); only matters if the commented-out list above is restored.
data2$place$country[data2$place$country %in% c('Rusia', 'Rusland', 'Russie', 'Russland', 'Rússia')] <- 'Russia'
data2$place$country[data2$place$country %in% c('Republic of Korea', '대한민국')] <- 'SK'
# Hong Kong is abbreviated to 'HK' here; the plotting section expands it
# again.  (A later recode of '香港' to 'Hong Kong' was removed: it could never
# match because '香港' has already been turned into 'HK' at this point.)
data2$place$country[data2$place$country %in% c('Hong Kong', '香港')] <- 'HK'
data2$place$country[data2$place$country %in% c('Spagna', 'Spain', 'Spanien', 'Spanje')] <- 'Spain'
data2$place$country[data2$place$country %in% c('Pakistán', 'پاکستان')] <- 'Pakistan'

# Convert Twitter's timestamp strings to POSIXct (GMT).
# NOTE(review): %a and %b are matched against the current LC_TIME locale, so
# this presumably needs an English locale to parse -- confirm on the machine
# used for replication.
data2$user.created_at2 <- as.POSIXct(data2$user$created_at, format="%a %b %d %H:%M:%S +0000 %Y", tz="GMT")
data2$created_at2 <- as.POSIXct(data2$created_at, format="%a %b %d %H:%M:%S +0000 %Y", tz="GMT")

# Get days on Twitter (account age when the tweet was sent).  Round since do
# not care about hours.
# Subtracting two POSIXct vectors lets R choose the unit (secs, mins, hours or
# days) from the smallest difference in the data, so dividing the result by
# 60*24 is only right when that unit happens to be minutes.  Asking difftime()
# for days explicitly gives the same answer regardless of the data.
data2$daysOnTwitter <- difftime(data2$created_at2, data2$user.created_at2, units="days")
data2$daysOnTwitter <- abs(round(as.numeric(data2$daysOnTwitter)))

# Copy the nested fields that get aggregated into plain top-level columns so
# that dplyr can refer to them by a simple name.
data2$placeCountry <- data2$place$country
data2$followers <- data2$user$followers_count
data2$friends <- data2$user$friends_count
data2$statuses <- data2$user$statuses_count

# Per-country summary statistics over every tweet, i.e. a user who appears in
# several tweets is counted once per tweet.  All statistics drop NAs (the
# means of statuses and account age previously did not, unlike the rest).
data2_short <- data2 %>%
  group_by(placeCountry) %>%
  summarize(
    meanFollowers=mean(followers, na.rm=TRUE),
    medianFollowers=median(followers, na.rm=TRUE),
    meanFollowing=mean(friends, na.rm=TRUE),
    medianFollowing=median(friends, na.rm=TRUE),
    meanStatuses=mean(statuses, na.rm=TRUE),
    medianStatuses=median(statuses, na.rm=TRUE),
    meanAge=mean(daysOnTwitter, na.rm=TRUE),
    medianAge=median(daysOnTwitter, na.rm=TRUE),
    count=n()
  )
data2_short$describe <- 'protest_allUsers'

# Matching table of standard errors (not SDs) for all users.  The columns
# mirror data2_short one-for-one because the plotting section pairs the two
# tables by position.  The median*_se columns hold the standard error of the
# mean of the same variable, and `count` is simply the group size again.
data2_short_se <- data2 %>%
  group_by(placeCountry) %>%
  summarize(
    meanFollowers_se=std.error(followers),
    medianFollowers_se=std.error(followers),
    meanFollowing_se=std.error(friends),
    medianFollowing_se=std.error(friends),
    meanStatuses_se=std.error(statuses),
    medianStatuses_se=std.error(statuses),
    meanAge_se=std.error(daysOnTwitter),
    medianAge_se=std.error(daysOnTwitter),
    count=n()
  )
data2_short_se$describe <- 'protest_allUsers'

# But some users are in there multiple times.  To deal with that, keep one
# randomly chosen tweet per user.  Only the plain columns are carried over
# because the sampling step failed on the full data frame.
data2$userID <- data2$user$id
data2_narrow <- data2[names(data2) %in% c('placeCountry', 'followers', 'friends', 'statuses', 'daysOnTwitter', 'userID')]
# Fix the seed so the randomly kept tweet -- and therefore every unique-user
# statistic written below -- is the same on every run.
set.seed(2019)
data2_unique <- data2_narrow %>%
  group_by(userID) %>%
  sample_n(size=1) %>%
  ungroup()

# Same summary statistics, one row per user.
data2_unique_short <- data2_unique %>%
  group_by(placeCountry) %>%
  summarize(
    meanFollowers=mean(followers, na.rm=TRUE),
    medianFollowers=median(followers, na.rm=TRUE),
    meanFollowing=mean(friends, na.rm=TRUE),
    medianFollowing=median(friends, na.rm=TRUE),
    meanStatuses=mean(statuses, na.rm=TRUE),
    medianStatuses=median(statuses, na.rm=TRUE),
    meanAge=mean(daysOnTwitter, na.rm=TRUE),
    medianAge=median(daysOnTwitter, na.rm=TRUE),
    count=n()
  )
data2_unique_short$describe <- 'protest_uniqueUsers'

# Standard errors for unique users (same layout as data2_short_se).
data2_unique_short_se <- data2_unique %>%
  group_by(placeCountry) %>%
  summarize(
    meanFollowers_se=std.error(followers),
    medianFollowers_se=std.error(followers),
    meanFollowing_se=std.error(friends),
    medianFollowing_se=std.error(friends),
    meanStatuses_se=std.error(statuses),
    medianStatuses_se=std.error(statuses),
    meanAge_se=std.error(daysOnTwitter),
    medianAge_se=std.error(daysOnTwitter),
    count=n()
  )
data2_unique_short_se$describe <- 'protest_uniqueUsers'

# Stack the all-tweets and unique-user tables; `describe` tells them apart.
protest <- data.frame(rbind(data2_short, data2_unique_short))
protest_se <- data.frame(rbind(data2_short_se, data2_unique_short_se))

# Save.  The two CSVs are what the plotting section reads back in.
write.csv(protest, './Data/comparingUsers/summaryStats_images_protest.csv', row.names=FALSE)
write.csv(protest_se, './Data/comparingUsers/summaryStats_images_protest_se.csv', row.names=FALSE)
# NOTE(review): no `type` is given, so this file presumably contains
# stargazer's default LaTeX output despite the .csv extension -- confirm
# before renaming it.
stargazer(protest, summary=FALSE, out='./Tables/summaryStats_images_protest.csv')

####################
#
#   PLOT
#
####################
# Load from above
# Re-read the summary tables written earlier in this script, so the plotting
# section can be run without reloading the raw tweets.
protest <- read.csv('./Data/comparingUsers/summaryStats_images_protest.csv', stringsAsFactors=FALSE)
noprotest <- read.csv('./Data/comparingUsers/summaryStats_images_noprotest.csv', stringsAsFactors=FALSE)
protest_se <- read.csv('./Data/comparingUsers/summaryStats_images_protest_se.csv', stringsAsFactors=FALSE)
noprotest_se <- read.csv('./Data/comparingUsers/summaryStats_images_noprotest_se.csv', stringsAsFactors=FALSE)

# Keep only the one-row-per-user summaries (drop the all-tweets rows).
protest_unique <- protest[protest$describe=='protest_uniqueUsers',]
noprotest_unique <- noprotest[noprotest$describe=='no_protest_uniqueUsers',]

protest_unique_se <- protest_se[protest_se$describe=='protest_uniqueUsers',]
noprotest_unique_se <- noprotest_se[noprotest_se$describe=='no_protest_uniqueUsers',]

# Stack protest and non-protest, in the same order for values and SEs.
uniqueUsers <- rbind(protest_unique, noprotest_unique)
uniqueUsers_se <- rbind(protest_unique_se, noprotest_unique_se)

# Melt to long format (one row per country x image type x measure) so that
# ggplot will like it.
uniqueUsers <- melt(uniqueUsers, id.vars=c('placeCountry', 'describe'), variable_name='Measure')
uniqueUsers_se <- melt(uniqueUsers_se, id.vars=c('placeCountry', 'describe'), variable_name='Measure')

# The SE column is attached by row position, so stop if the two long tables
# do not line up on country, image type and measure (the SE measures carry an
# extra '_se' suffix; 'count' has none in either table).
stopifnot(
  nrow(uniqueUsers) == nrow(uniqueUsers_se),
  identical(as.character(uniqueUsers$placeCountry), as.character(uniqueUsers_se$placeCountry)),
  identical(as.character(uniqueUsers$describe), as.character(uniqueUsers_se$describe)),
  identical(as.character(uniqueUsers$Measure), sub('_se$', '', as.character(uniqueUsers_se$Measure)))
)
uniqueUsers$se <- uniqueUsers_se$value

# The SE tables carry the plain group size in their `count` column, not a
# standard error.  Blank it so the user count is not drawn with an error bar
# of +/- 1.96 * count.
uniqueUsers$se[as.character(uniqueUsers$Measure) == 'count'] <- NA

# Tidy the labels that end up on the figure.

# `describe` becomes `Images`, with values 'Protest' / 'No Protest'.
names(uniqueUsers)[names(uniqueUsers) == 'describe'] <- 'Images'
is_protest <- gsub('_uniqueUsers', '', uniqueUsers$Images) == 'protest'
uniqueUsers$Images <- ifelse(is_protest, 'Protest', 'No Protest')

# Human-readable names for the measures.  Anything not listed here keeps its
# original name.
measure_labels <- c(
  count='Unique Users',
  medianAge='Med. Age',
  meanAge='Avg. Age',
  medianStatuses='Med. Tweets',
  meanStatuses='Avg. Tweets',
  medianFollowing='Med. Following',
  meanFollowing='Avg. Following',
  medianFollowers='Med. Followers',
  meanFollowers='Avg. Followers'
)
measure_chr <- as.character(uniqueUsers$Measure)  # Work on strings, not factor codes
has_label <- measure_chr %in% names(measure_labels)
measure_chr[has_label] <- unname(measure_labels[measure_chr[has_label]])
uniqueUsers$Measure <- as.factor(measure_chr)  # Put back to factor

# Keep just countries in main paper
main_paper_countries <- c('Spain', 'Venezuela', 'SK', 'HK', 'Hong Kong', 'Pakistan')
uniqueUsers <- uniqueUsers[uniqueUsers$placeCountry %in% main_paper_countries,]

# Spell the abbreviated country names out for the facet labels; done here
# rather than higher up so the raw data does not have to be reloaded.
uniqueUsers$placeCountry[uniqueUsers$placeCountry=='SK'] <- 'South Korea'
uniqueUsers$placeCountry[uniqueUsers$placeCountry=='HK'] <- 'Hong Kong'

# Drop the median rows: the error bars are built from standard errors of the
# mean, which only make sense for the averages.
is_median <- grepl('Med.', uniqueUsers$Measure)
uniqueUsers <- uniqueUsers[!is_median,]

# Make figure: one facet row per country, one line per measure, protest vs.
# non-protest image tweeters dodged vertically, with 95% intervals
# (value +/- 1.96 * se).
# theme_classic() is a complete theme and replaces every setting made before
# it, so it has to come first and the axis-title tweaks after it; otherwise
# the axis titles are not actually blanked.
comparison_plot <- ggplot(uniqueUsers, aes(x=value, y=as.factor(Measure), group=Images, linetype=Images)) +
  geom_point(position=position_dodgev(height=1)) +
  geom_errorbarh(aes(xmax=value+1.96*se, xmin=value-1.96*se), position=position_dodgev(height=1)) +
  facet_grid(rows=vars(placeCountry)) +
  theme_classic() +
  theme(axis.title.x=element_blank(), axis.title.y=element_blank())

# A ggplot object is only drawn when it is printed.  Top-level auto-printing
# does not happen under source() or inside a function, so print explicitly or
# the PDF comes out empty.
pdf('./Figures/comparePhotos_uniqueUsers_ProtestPhotos.pdf')
print(comparison_plot)
dev.off()


